{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "34ea4354",
   "metadata": {},
   "source": [
    "# BentoML Demo - IEEE-CIS Fraud Detection"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b73a62eb",
   "metadata": {},
   "source": [
    "Accept dataset rules on Kaggle before downloading: https://www.kaggle.com/competitions/ieee-fraud-detection/data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f82d57c9",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:50:58.429263Z",
     "iopub.status.busy": "2023-03-09T06:50:58.428906Z",
     "iopub.status.idle": "2023-03-09T06:50:58.442175Z",
     "shell.execute_reply": "2023-03-09T06:50:58.441777Z"
    }
   },
   "outputs": [],
   "source": [
    "# Set Kaggle Credentials for downloading dataset\n",
    "%env KAGGLE_USERNAME=\n",
    "%env KAGGLE_KEY="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "945f2734",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:50:58.444818Z",
     "iopub.status.busy": "2023-03-09T06:50:58.444485Z",
     "iopub.status.idle": "2023-03-09T06:50:58.986259Z",
     "shell.execute_reply": "2023-03-09T06:50:58.985255Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ieee-fraud-detection.zip: Skipping, found more recently modified local copy (use --force to force download)\n",
      "Archive:  ieee-fraud-detection.zip\n",
      "  inflating: ./data/sample_submission.csv  \n",
      "  inflating: ./data/test_identity.csv  \n",
      "  inflating: ./data/test_transaction.csv  \n",
      "  inflating: ./data/train_identity.csv  \n",
      "  inflating: ./data/train_transaction.csv  \n"
     ]
    }
   ],
   "source": [
    "!kaggle competitions download -c ieee-fraud-detection\n",
    "!rm -rf ./data/\n",
    "!unzip -d ./data/ ieee-fraud-detection.zip && rm ieee-fraud-detection.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c49fd861",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:50:58.998981Z",
     "iopub.status.busy": "2023-03-09T06:50:58.995168Z",
     "iopub.status.idle": "2023-03-09T06:51:07.380130Z",
     "shell.execute_reply": "2023-03-09T06:51:07.379697Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "df_transactions = pd.read_csv(\"./data/train_transaction.csv\")\n",
    "\n",
    "X = df_transactions.drop(columns=[\"isFraud\"])\n",
    "y = df_transactions.isFraud"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75e1312a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:51:07.383150Z",
     "iopub.status.busy": "2023-03-09T06:51:07.382994Z",
     "iopub.status.idle": "2023-03-09T06:51:08.059790Z",
     "shell.execute_reply": "2023-03-09T06:51:08.058540Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.preprocessing import (\n",
    "    StandardScaler,\n",
    "    OneHotEncoder,\n",
    "    LabelEncoder,\n",
    "    OrdinalEncoder,\n",
    ")\n",
    "from sklearn.feature_selection import SelectPercentile, chi2\n",
    "\n",
    "numeric_features = df_transactions.select_dtypes(include=\"float64\").columns\n",
    "categorical_features = df_transactions.select_dtypes(include=\"object\").columns\n",
    "\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        (\"num\", SimpleImputer(strategy=\"median\"), numeric_features),\n",
    "        (\n",
    "            \"cat\",\n",
    "            OrdinalEncoder(handle_unknown=\"use_encoded_value\", unknown_value=-1),\n",
    "            categorical_features,\n",
    "        ),\n",
    "    ],\n",
    "    verbose_feature_names_out=False,\n",
    "    remainder=\"passthrough\",\n",
    ")\n",
    "preprocessor.set_output(transform=\"pandas\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a0d3d70",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:51:08.063015Z",
     "iopub.status.busy": "2023-03-09T06:51:08.062784Z",
     "iopub.status.idle": "2023-03-09T06:51:24.798036Z",
     "shell.execute_reply": "2023-03-09T06:51:24.797677Z"
    }
   },
   "outputs": [],
   "source": [
    "X = preprocessor.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3efa93f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:51:24.800370Z",
     "iopub.status.busy": "2023-03-09T06:51:24.800220Z",
     "iopub.status.idle": "2023-03-09T06:51:25.734700Z",
     "shell.execute_reply": "2023-03-09T06:51:25.734322Z"
    }
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6e4b919",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:51:25.736827Z",
     "iopub.status.busy": "2023-03-09T06:51:25.736679Z",
     "iopub.status.idle": "2023-03-09T06:51:25.759545Z",
     "shell.execute_reply": "2023-03-09T06:51:25.759232Z"
    }
   },
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "\n",
    "\n",
    "def train(n_estimators, max_depth):\n",
    "    return xgb.XGBClassifier(\n",
    "        tree_method=\"hist\",\n",
    "        n_estimators=n_estimators,\n",
    "        max_depth=max_depth,\n",
    "        eval_metric=\"aucpr\",\n",
    "        objective=\"binary:logistic\",\n",
    "        enable_categorical=True,\n",
    "    ).fit(X_train, y_train, eval_set=[(X_test, y_test)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8944cd0",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:51:25.761463Z",
     "iopub.status.busy": "2023-03-09T06:51:25.761324Z",
     "iopub.status.idle": "2023-03-09T06:52:16.649884Z",
     "shell.execute_reply": "2023-03-09T06:52:16.649519Z"
    }
   },
   "outputs": [],
   "source": [
    "# small model with 300 gradient boosted trees and a maximum tree depth of 5\n",
    "model_sm = train(300, 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c549dd2b",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:52:16.652081Z",
     "iopub.status.busy": "2023-03-09T06:52:16.651916Z",
     "iopub.status.idle": "2023-03-09T06:52:17.047092Z",
     "shell.execute_reply": "2023-03-09T06:52:17.046693Z"
    }
   },
   "outputs": [],
   "source": [
    "import bentoml\n",
    "\n",
    "bentoml.xgboost.save_model(\n",
    "    \"ieee-fraud-detection-sm\",\n",
    "    model_sm,\n",
    "    signatures={\n",
    "        \"predict_proba\": {\"batchable\": True},\n",
    "    },\n",
    "    custom_objects={\"preprocessor\": preprocessor},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f22add54",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:52:17.049300Z",
     "iopub.status.busy": "2023-03-09T06:52:17.049115Z",
     "iopub.status.idle": "2023-03-09T06:52:17.053783Z",
     "shell.execute_reply": "2023-03-09T06:52:17.053478Z"
    }
   },
   "outputs": [],
   "source": [
    "model_ref = bentoml.xgboost.get(\"ieee-fraud-detection-sm:latest\")\n",
    "model_ref"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "efcf15eb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:52:17.055505Z",
     "iopub.status.busy": "2023-03-09T06:52:17.055384Z",
     "iopub.status.idle": "2023-03-09T06:52:23.709177Z",
     "shell.execute_reply": "2023-03-09T06:52:23.708889Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import bentoml\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "model_ref = bentoml.xgboost.get(\"ieee-fraud-detection-sm:latest\")\n",
    "model_runner = model_ref.to_runner()\n",
    "model_runner.init_local()\n",
    "model_preprocessor = model_ref.custom_objects[\"preprocessor\"]\n",
    "\n",
    "test_transactions = pd.read_csv(\"./data/test_transaction.csv\")[0:500]\n",
    "test_transactions = model_preprocessor.transform(test_transactions)\n",
    "result = model_runner.predict_proba.run(test_transactions)\n",
    "np.argmax(result, axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0e3ac781",
   "metadata": {},
   "source": [
    "For the Inference Graph demo, let's train two additional models by tweaking the parameters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ba5780a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T06:52:23.711339Z",
     "iopub.status.busy": "2023-03-09T06:52:23.711233Z",
     "iopub.status.idle": "2023-03-09T07:03:17.724144Z",
     "shell.execute_reply": "2023-03-09T07:03:17.723796Z"
    }
   },
   "outputs": [],
   "source": [
    "# large model with 3000 gradient boosted trees and a maximum tree depth of 15\n",
    "model_lg = train(3000, 15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "318d54cb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T07:03:17.726234Z",
     "iopub.status.busy": "2023-03-09T07:03:17.726122Z",
     "iopub.status.idle": "2023-03-09T07:03:17.958983Z",
     "shell.execute_reply": "2023-03-09T07:03:17.958688Z"
    }
   },
   "outputs": [],
   "source": [
    "import bentoml\n",
    "\n",
    "bentoml.xgboost.save_model(\n",
    "    \"ieee-fraud-detection-lg\",\n",
    "    model_lg,\n",
    "    signatures={\n",
    "        \"predict_proba\": {\"batchable\": True},\n",
    "    },\n",
    "    custom_objects={\"preprocessor\": preprocessor},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "959d20ed",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T07:03:17.960692Z",
     "iopub.status.busy": "2023-03-09T07:03:17.960576Z",
     "iopub.status.idle": "2023-03-09T07:03:36.003487Z",
     "shell.execute_reply": "2023-03-09T07:03:36.003154Z"
    }
   },
   "outputs": [],
   "source": [
    "# tiny model with 300 gradient boosted trees and a maximum tree depth of 5\n",
    "model_tiny = train(100, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e86102e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-09T07:03:36.005582Z",
     "iopub.status.busy": "2023-03-09T07:03:36.005448Z",
     "iopub.status.idle": "2023-03-09T07:03:36.150045Z",
     "shell.execute_reply": "2023-03-09T07:03:36.149710Z"
    }
   },
   "outputs": [],
   "source": [
    "import bentoml\n",
    "\n",
    "bentoml.xgboost.save_model(\n",
    "    \"ieee-fraud-detection-tiny\",\n",
    "    model_tiny,\n",
    "    signatures={\n",
    "        \"predict_proba\": {\"batchable\": True},\n",
    "    },\n",
    "    custom_objects={\"preprocessor\": preprocessor},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "664f1b4e",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
